{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Creating one Standardized Dataframe from Perseus, Gorman, Pedalion, and Proiel Trees\n",
    "\n",
    "In this notebook, I will demonstrate the code used to standardize Perseus, Gorman, Pedalion, and Proiel Treebanks encoded in xml. \n",
    "\n",
    "These treebanks total to approximately one million words of Ancient Greek, but in their current formats, it is impossible to look at these treebanked words as a whole. All of the treebanks I examine use different encoding patterns, meaning that accessing the morphological and structural data in treebanks is only possible by examining the differences in encoding and tailoring queries to each separate encoded xml file. \n",
    "My aim is to collect the work that has been put into Ancient Greek Treebanks, so that the one million treebanked words are in one standardized format.\n",
    "\n",
    "(Note that this is here for replication purposes.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import plotly.express as px\n",
    "\n",
    "from bs4 import BeautifulSoup, NavigableString\n",
    "from collections import defaultdict\n",
    "\n",
    "import re\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# mappings of POStags to the meanings they stand for\n",
    "# taken directly from Dr. Crane's 'TreebankCount.ipynb'\n",
    "tenses  = {'p':'present','f':'future', 'i':'imperfect','a':'aorist','r':'perfect','l':'pluperfect',\n",
    "           't':'future perfect','s':'resultative','x':'uncertain'}\n",
    "voices  = {'a':'active','m':'middle','p':'passive', 'i':'imperfect','e':'middle-passive'}\n",
    "moods   = {'i':'indicative','s':'subjunctive', 'o':'optative','m':'imperative','p':'participle',\n",
    "           'n':'infinitive', 'g':'verbal_adjective','x':'uncertain'}\n",
    "degrees = {'c':'comparative', 's': 'superlative','p':'positive'}\n",
    "numbers = {'s':'singular','p':'plural','d':'dual','x':'uncertain'}\n",
    "persons = {'1':'1st','2':'2nd','3':'3rd'}\n",
    "genders = {'c':'common','f':'feminine', 'n':'neuter','m':'masculine','x':'uncertain',\n",
    "           'p':'masculine or feminine', 'r':'feminine or neuter', 'q': 'masculine, feminine or neuter', \n",
    "           'o':'masculine or neuter'}\n",
    "cases   = {'n':'nominative', 'l':'locative','v':'vocative','g':'genitive','d':'dative',\n",
    "           'a':'accusative','x':'uncertain'}\n",
    "POSes   = {'n': 'noun','v': 'verb','t': 'participle','a': 'adjective','d': 'adverb','l': 'article','g': 'particle',\n",
    "           'c': 'conjunction','r': 'preposition','p': 'pronoun','m': 'numeral','i': 'interjection','e': 'exclamation',\n",
    "           'u': 'punctuation','x': 'irregular',\n",
    "           #proiel POStags\n",
    "           'A-': 'adjective','Df': 'adverb','S-': 'article','Ma': 'cardinal numeral','Nb': 'common noun',\n",
    "           'C-': 'conjunction','Pd': 'demonstrative pronoun','F-': 'foreign word', 'Px': 'indefinite pronoun',\n",
    "           'N-': 'infinitive marker','I-': 'interjection','Du': 'interrogative adverb','Pi': 'interrogative pronoun',\n",
    "           'Mo': 'ordinal numeral','Pp': 'personal pronoun','Pk': 'personal reflexive pronoun',\n",
    "           'Ps': 'possessive pronoun','Pt': 'possessive reflexive pronoun','R-': 'preposition','Ne': 'proper noun',\n",
    "           'Py': 'quantifier','Pc': 'reciprocal pronoun','Dq': 'relative adverb','Pr': 'relative pronoun',\n",
    "           'G-': 'subjunction','V-': 'verb','X-': 'unassigned'}\n",
    "\n",
    "strengths   = {'w': 'weak','s': 'strong','t': 'weak or strong'}\n",
    "inflections = {'n': 'non-inflecting','i': 'inflecting'}\n",
    "\n",
    "# mappings of strings in the Proiel Greek NT to TLG works\n",
    "ntworks = {\"MATT\": \"tlg001\",\"MARK\": \"tlg002\",\"LUKE\": \"tlg003\",\"JOHN\": \"tlg004\",\"ACTS\": \"tlg005\",\"ROM\": \"tlg006\",\n",
    "           \"1COR\": \"tlg007\",\"2COR\": \"tlg008\",\"GAL\": \"tlg009\",\"EPH\": \"tlg010\",\"PHIL\": \"tlg011\",\"COL\": \"tlg012\",\n",
    "           \"1THESS\": \"tlg013\",\"2THESS\": \"tlg014\",\"1TIM\": \"tlg015\",\"2TIM\": \"tlg016\",\"TIT\": \"tlg017\",\"PHILEM\": \"tlg018\",\n",
    "           \"HEB\": \"tlg019\",\"JAS\": \"tlg020\",\"1PET\": \"tlg021\",\"3JOHN\": \"tlg025\",\"JUDE\": \"tlg026\",\"REV\": \"tlg027\"}\n",
    "\n",
    "authinfo = {}\n",
    "authinfo['tlg0011'] = 'Sophocles,-5,-497,-406,poetry,drama'\n",
    "authinfo['tlg1220'] = 'Batrachomyomachia,-1,-100,-1,poetry,hexameter'\n",
    "authinfo['tlg0013'] = 'Homeric Hymns,-6,-650,-450,poetry,hexameter'\n",
    "authinfo['tlg0020'] = 'Hesiod,-8,-750,-650,poetry,hexameter'\n",
    "authinfo['tlg0026'] = 'Aeschines,-4,-389,-314,prose,orator'\n",
    "authinfo['tlg0058'] = 'Aeneas Tacticus,4,301,400,prose,misc'\n",
    "authinfo['tlg0096'] = 'Aesop,-6,-620,-564,prose,misc'\n",
    "authinfo['tlg0085'] = 'Aeschylus,-5,-525c,-455c,poetry,drama'\n",
    "authinfo['tlg0028'] = 'Antiphon,-5,-490,-411,prose,orator'\n",
    "authinfo['tlg0551'] = 'Appian,2,95,165,prose,history'\n",
    "authinfo['tlg0086'] = 'Aristotle,-4,-384,-322,prose,philosophy'\n",
    "authinfo['tlg0019'] = 'Aristophanes,-5,-446,-386,poetry,drama'\n",
    "authinfo['tlg0008'] = 'Athenaeus,3,170,223,prose,misc'\n",
    "authinfo['tlg0554'] = 'Chariton,2,101,200,prose,misc'\n",
    "authinfo['tlg0041'] = 'Chionis Epistulae,1,1,200,prose,misc'\n",
    "authinfo['tlg0627'] = 'Corpus Hippocraticum,-5,-450,-350,prose,misc'\n",
    "authinfo['tlg0014'] = 'Demosthenes,-4,-384,-322,prose,orator'\n",
    "authinfo['tlg0060'] = 'Diodorus Siculus,-1,-90c,-30c,prose,history'\n",
    "authinfo['tlg0081'] = 'Dionysius of Halicarnassus,-1,-60c,-7c,prose,history'\n",
    "authinfo['tlg0557'] = 'Epictetus,2,50,135,prose,philosopy'\n",
    "authinfo['tlg0537'] = 'Epicurus,-3,-341,-270,prose,philosophy'\n",
    "authinfo['tlg0343'] = 'Ezechiel the Tragic Poet,-2,-200,-101,poetry,drama'\n",
    "authinfo['tlg0006'] = 'Euripides,-5,-480c,-406c,poetry,drama'\n",
    "authinfo['tlg0016'] = 'Herodotus,-5,-484c,-425c,prose,history'\n",
    "authinfo['tlg0559'] = 'Heron of Alexandria,1,10,70,prose,misc'\n",
    "authinfo['tlg0010'] = 'Isocrates,-4,-436,-338,prose,orator'\n",
    "authinfo['tlg0526'] = 'Josephus,1,37,100c,prose,history'\n",
    "authinfo['tlg2003'] = 'Julian the Apostate,4,331,363,prose,misc'\n",
    "authinfo['tlg0561'] = 'Longus,2,101,200,prose,misc'\n",
    "authinfo['tlg0061'] = 'Pseudo-Lucian,3,201,400,prose,misc'\n",
    "authinfo['tlg0062'] = 'Lucian,2,125,180,prose,misc'\n",
    "authinfo['tlg0540'] = 'Lysias,-4,-445c,-380c,prose,orator'\n",
    "authinfo['tlg0541'] = 'Menander,-3,-342,-291,poetry,drama'\n",
    "authinfo['tlg0255'] = 'Mimnermus,-7,-650,-600,poetry,lyreleg'\n",
    "authinfo['tlgX208'] = 'Paeanius,4,301,400,prose,misc'\n",
    "authinfo['tlg0585'] = 'Phlegon,2,151,200,prose,misc'\n",
    "authinfo['tlg0059'] = 'Plato,-4,-428c,-347c,prose,philosophy'\n",
    "authinfo['tlg0007'] = 'Plutarch,2,46,119,prose,history'\n",
    "authinfo['tlg0543'] = 'Polybius,-2,-200,-118,prose,history'\n",
    "authinfo['tlg4029'] = 'Procopius,6,500,565,prose,history'\n",
    "authinfo['tlg0009'] = 'Sappho,-7,-630,-570,poetry,lyreleg'\n",
    "authinfo['tlg0260'] = 'Semonides,-7,-700,-601,poetry,lyreleg'\n",
    "authinfo['tlg0527'] = 'Septuagint,-2,-250,-100,prose,bible'\n",
    "authinfo['tlg0544'] = 'Sextus Empiricus,3,150,250,prose,philosophy'\n",
    "authinfo['tlg0032'] = 'Xenophon,-4,-430c,-354,prose,history'\n",
    "authinfo['tlg0005'] = 'Theocritus,-3,-300,-255,poetry,hexameter'\n",
    "authinfo['tlg0003'] = 'Thucydides,-5,-460c,-400c,prose,history'\n",
    "authinfo['tlg0093'] = 'Theophrastus,-4,-371,-287,prose,philosophy'\n",
    "authinfo['tlg0012'] = 'Homer,-8,-775c,-700c,poetry,hexameter'\n",
    "authinfo['tlg0031'] = 'New Testament,1,80,100,prose,bible'\n",
    "authinfo['tlg3143'] = 'Georgius Sphrantzes,16,1480,1550,prose,history'\n",
    "authinfo['papyri']  = 'papyri,?,?,?,prose,misc'\n",
    "authinfo['Chilia']  = ''\n",
    "authinfo['Pedalion'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# change ROOT to local root\n",
    "# wget repository\n",
    "# hack to see whose computer we are\n",
    "curwd = os.getcwd()\n",
    "if(re.search('gcrane',curwd)):\n",
    "    ROOT = '/Users/gcrane/'\n",
    "    LOCALPATH = os.path.join(ROOT, 'github')\n",
    "else:\n",
    "    ROOT = \"/Users/bellahwang/Documents\"\n",
    "    LOCALPATH = os.path.join(ROOT, 'GitHub')\n",
    "    \n",
    "AGDPATH  = os.path.join(LOCALPATH, 'gAGDT', 'data', 'xml')\n",
    "GORPATH  = os.path.join(LOCALPATH, 'gorman-trees', 'public', 'xml')\n",
    "PEDPATH  = os.path.join(LOCALPATH, 'pedalion-trees', 'public', 'xml')\n",
    "PROPATH  = os.path.join(LOCALPATH, 'proiel-treebank')\n",
    "FILEPATH = os.path.join(LOCALPATH, 'treebankstats')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# taken from Dr. Crane's 'TreebankCount.ipynb' + some modifications\n",
    "def addfiles(dirname,flist):\n",
    "    for foo in sorted(os.listdir(dirname)):\n",
    "        if(re.search('proiel', dirname)):\n",
    "            if(not re.search('(chron|greek-nt|hdt)\\.xml', foo)):\n",
    "                continue\n",
    "        elif(not re.search('\\.xml$', foo)):\n",
    "            continue\n",
    "        newpath = os.path.join(dirname, foo)\n",
    "        flist.append(newpath)\n",
    "    return(flist)\n",
    "\n",
    "searchfiles = []\n",
    "searchfiles = addfiles(AGDPATH, searchfiles)\n",
    "searchfiles = addfiles(PEDPATH, searchfiles)\n",
    "searchfiles = addfiles(GORPATH, searchfiles)\n",
    "searchfiles = addfiles(PROPATH, searchfiles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "authList = defaultdict(list)\n",
    "PostagExcept = ['\"\"', '_', '-', '', '--------']\n",
    "PostagMistake = ['m-------', 'c-------', 'b-------', 'd-------', 'c']\n",
    "\n",
    "def findAll(FILENAME):\n",
    "    authTLG = '-'\n",
    "    workTLG = '-'\n",
    "    returnList = []\n",
    "    with open(FILENAME, 'r', encoding=\"utf-8\") as f:\n",
    "        soup = BeautifulSoup(f, \"xml\")\n",
    "        \n",
    "        # for proiel trees\n",
    "        if (re.search(\"proiel\", FILENAME)):\n",
    "            if (re.search(\"greek-nt\\.xml\", FILENAME)):\n",
    "                authTLG = 'tlg0031'\n",
    "            elif (re.search(\"chron\\.xml\", FILENAME)):\n",
    "                authTLG = 'tlg3143'\n",
    "                workTLG = 'tlg001'\n",
    "            elif (re.search(\"hdt\\.xml\", FILENAME)):\n",
    "                authTLG = 'tlg0016'\n",
    "                workTLG = 'tlg001'\n",
    "                \n",
    "            for sentence in soup('sentence'):\n",
    "                sentID = sentence['id']\n",
    "                docID  = '-'\n",
    "                author = '-'\n",
    "                for token in sentence('token'):    \n",
    "                    wordID = token['id']\n",
    "                    if token.has_attr('head-id'):\n",
    "                        head = token['head-id']\n",
    "                    else:\n",
    "                        head = '-'\n",
    "                    if token.has_attr('form'):\n",
    "                        form = token['form']\n",
    "                    else:\n",
    "                        form = '-'\n",
    "                    if token.has_attr('lemma'):\n",
    "                        lemma = token['lemma']\n",
    "                    else:\n",
    "                        lemma = '-'\n",
    "                    \n",
    "                    # ignores book 1 of herodotus\n",
    "                    if token.has_attr('citation-part'):\n",
    "                        subdoc = token['citation-part']\n",
    "                        if(len(subdoc.split())>1):\n",
    "                            if(authTLG == 'tlg0031'):                                \n",
    "                                args = subdoc.split()\n",
    "                                workname = args[0]\n",
    "                                if(workname in ntworks):\n",
    "                                    workTLG = ntworks[workname]\n",
    "                                else:\n",
    "                                    print('bad NT ref:',subdoc)\n",
    "                            else:\n",
    "                                print('doublearg',args)\n",
    "                        book    = subdoc.split('.')[0]\n",
    "                        if book == '1' and authTLG == 'tlg0016':\n",
    "                            continue\n",
    "                    else:\n",
    "                        subdoc  = '-'\n",
    "                    if token.has_attr('relation'):\n",
    "                        relation = token['relation']\n",
    "                    else:\n",
    "                        relation = '-'\n",
    "                    ref = '-'\n",
    "                    if token.has_attr('presentation-after'):\n",
    "                        presentation_after = token['presentation-after']\n",
    "                    else:\n",
    "                        presentation_after = '-'\n",
    "                    if token.has_attr('empty-token-sort'):\n",
    "                        emptyTokenSort = token['empty-token-sort']\n",
    "                    else:\n",
    "                        emptyTokenSort = '-'\n",
    "                        \n",
    "                    insertionID = '-'\n",
    "                    artificial  = '-'\n",
    "                    \n",
    "                    if token.has_attr('gloss'):\n",
    "                        gloss = token['gloss']\n",
    "                    else:\n",
    "                        gloss = '-'\n",
    "                        \n",
    "                    sem = '-'\n",
    "                    \n",
    "                    slashList = []\n",
    "                    for slash in token('slash'):\n",
    "                        indivList = []\n",
    "                        \n",
    "                        target_id = slash['target-id']\n",
    "                        relation = slash['relation']\n",
    "\n",
    "                        indivList.append(target_id)\n",
    "                        indivList.append(relation)\n",
    "                        slashList.append(indivList)\n",
    "                        \n",
    "                    if not slashList:\n",
    "                        slashList = '-'\n",
    "                    \n",
    "                    if token.has_attr('part-of-speech'):\n",
    "                        pos = token['part-of-speech']\n",
    "                    else:\n",
    "                        pos = '-'\n",
    "                    if token.has_attr('morphology'):\n",
    "                        postag     = token['morphology']\n",
    "                        person     = postag[0]\n",
    "                        number     = postag[1]\n",
    "                        tense      = postag[2]\n",
    "                        mood       = postag[3]\n",
    "                        voice      = postag[4]\n",
    "                        gender     = postag[5]\n",
    "                        case       = postag[6]\n",
    "                        degree     = postag[7]\n",
    "                        strength   = postag[8]\n",
    "                        inflection = postag[9]\n",
    "                    else:\n",
    "                        postag     = '-'\n",
    "                        pos        = '-'\n",
    "                        person     = '-'\n",
    "                        number     = '-'\n",
    "                        tense      = '-'\n",
    "                        mood       = '-'\n",
    "                        voice      = '-'\n",
    "                        gender     = '-'\n",
    "                        case       = '-'\n",
    "                        degree     = '-'\n",
    "                        strength   = '-'\n",
    "                        inflection = '-'\n",
    "        \n",
    "                    wordList = []\n",
    "                    wordList.append(authinfo[authTLG].split(',')[0])\n",
    "                    wordList.append(authTLG)\n",
    "                    wordList.append(workTLG)\n",
    "                    if not authinfo[authTLG] == '':\n",
    "                        wordList.append(authinfo[authTLG].split(',')[1])\n",
    "                        wordList.append(authinfo[authTLG].split(',')[2])\n",
    "                        wordList.append(authinfo[authTLG].split(',')[3])\n",
    "                        wordList.append(authinfo[authTLG].split(',')[4])\n",
    "                        wordList.append(authinfo[authTLG].split(',')[5])\n",
    "                    else:\n",
    "                        wordList.append('-')\n",
    "                        wordList.append('-')\n",
    "                        wordList.append('-')\n",
    "                        wordList.append('-')\n",
    "                        wordList.append('misc') #genre\n",
    "                    wordList.append(sentID)\n",
    "                    wordList.append(docID)\n",
    "                    wordList.append(subdoc)\n",
    "                    wordList.append(author)\n",
    "                    wordList.append(wordID)\n",
    "                    wordList.append(head)\n",
    "                    wordList.append(form)\n",
    "                    wordList.append(lemma)\n",
    "                    wordList.append(relation)\n",
    "                    wordList.append(ref)\n",
    "                    wordList.append(presentation_after)\n",
    "                    wordList.append(insertionID)\n",
    "                    wordList.append(artificial)\n",
    "                    wordList.append(gloss)\n",
    "                    wordList.append(sem)\n",
    "                    wordList.append(slashList)\n",
    "                    wordList.append(postag)\n",
    "                    wordList.append(pos)\n",
    "                    wordList.append(person)\n",
    "                    wordList.append(number)\n",
    "                    wordList.append(tense)\n",
    "                    wordList.append(mood)\n",
    "                    wordList.append(voice)\n",
    "                    wordList.append(gender)\n",
    "                    wordList.append(case)\n",
    "                    wordList.append(degree)\n",
    "                    wordList.append(strength)\n",
    "                    wordList.append(inflection)\n",
    "                    returnList.append(wordList)\n",
    "        \n",
    "        # for gorman, pedalion, and gAGDT trees\n",
    "            \n",
    "        else:\n",
    "            if(re.search('example-sentences', FILENAME)):\n",
    "                return\n",
    "            else:\n",
    "                if(re.search(\"papyri\", FILENAME)):\n",
    "                    authTLG = 'papyri'\n",
    "                    \n",
    "                for sentence in soup('sentence'):\n",
    "                    sentID = sentence['id']\n",
    "                    docID  = sentence['document_id']\n",
    "                    if sentence.has_attr('subdoc'):\n",
    "                        subdoc  = sentence['subdoc']\n",
    "                    else:\n",
    "                        subdoc  = '-'\n",
    "                    if sentence.has_attr('Author'):\n",
    "                        author = sentence['Author']\n",
    "                    else:\n",
    "                        author = '-'\n",
    "                    # find author ID\n",
    "                    if (\"urn:cts:greekLit:\" in docID):\n",
    "                        docIDList = docID.split(':')\n",
    "                        if (\"tlg\" in docIDList[3]):\n",
    "                            authTLG = docIDList[3].split('.')[0]\n",
    "                            workTLG = docIDList[3].split('.')[1]\n",
    "                        else:\n",
    "                            authTLG = docIDList[-1].split('.')[0]\n",
    "                            workTLG = docIDList[-1].split('.')[1]\n",
    "                    elif (\"Perseus:text:\" in docID):\n",
    "                        authTLG = 'tlg0008'\n",
    "                        workTLG = 'tlg001'\n",
    "                    elif (re.search(r'....-...', docID)):\n",
    "                        authTLG = 'tlg' + docID.split('-')[0]\n",
    "                        workTLG = 'tlg' + docID.split('-')[1]\n",
    "                    elif (\"NT\" in docID):\n",
    "                        authTLG = 'tlg0031'\n",
    "                    elif (\"Ps-Luc\" in docID):\n",
    "                        authTLG = 'tlg0061'\n",
    "                    elif (\"Paean\" in docID):\n",
    "                        authTLG = 'tlgX208'\n",
    "                    elif (\"Genesis\" in docID):\n",
    "                        authTLG = 'tlg0527'\n",
    "                    elif (\"Chilia\" in docID):\n",
    "                        authTLG = 'Chilia'\n",
    "                    elif (\"Pedalion\" in docID):\n",
    "                        authTLG = 'Pedalion'\n",
    "                    elif (\"Mimn\" in docID):\n",
    "                        authTLG = 'tlg0255'\n",
    "                    elif (\"0260\" in docID):\n",
    "                        authTLG = 'tlg0260'\n",
    "                    elif (\"0005\" in docID):\n",
    "                        authTLG = 'tlg0005'\n",
    "\n",
    "                    for word in sentence('word'):    \n",
    "                        wordID = word['id']\n",
    "                        head = word['head']\n",
    "                        if word.has_attr('form'):\n",
    "                            form = word['form']\n",
    "                        else:\n",
    "                            form = '-'\n",
    "                        if word.has_attr('lemma'):\n",
    "                            lemma = word['lemma']\n",
    "                        else:\n",
    "                            lemma = '-'\n",
    "                        if word.has_attr('relation'):\n",
    "                            relation = word['relation']\n",
    "                        else:\n",
    "                            relation = '-'\n",
    "                        if word.has_attr('ref'):\n",
    "                            ref = word['ref']\n",
    "                        else:\n",
    "                            ref = '-'\n",
    "                        presentation_after = '-'\n",
    "                        if word.has_attr('insertion_id'):\n",
    "                            insertionID = word['insertion_id']\n",
    "                        else:\n",
    "                            insertionID = '-'\n",
    "                        if word.has_attr('artificial'):\n",
    "                            artificial = word['artificial']\n",
    "                        else:\n",
    "                            artificial = '-'\n",
    "                        if word.has_attr('gloss'):\n",
    "                            gloss = word['gloss']\n",
    "                        else:\n",
    "                            gloss = '-' \n",
    "                        if word.has_attr('sem'):\n",
    "                            sem = word['sem']\n",
    "                        else:\n",
    "                            sem = '-'\n",
    "\n",
    "                        slashList = '-'\n",
    "\n",
    "                        if word.has_attr('postag'):\n",
    "                            postag = word['postag']\n",
    "                            if postag in PostagExcept:\n",
    "                                postag     = '-'\n",
    "                                pos        = '-'\n",
    "                                person     = '-'\n",
    "                                number     = '-'\n",
    "                                tense      = '-'\n",
    "                                mood       = '-'\n",
    "                                voice      = '-'\n",
    "                                gender     = '-'\n",
    "                                case       = '-'\n",
    "                                degree     = '-'\n",
    "                                strength   = '-'\n",
    "                                inflection = '-'\n",
    "                            elif postag in PostagMistake:\n",
    "                                postag     = postag[0] + '--------'\n",
    "                                pos        = postag[0]\n",
    "                                person     = '-'\n",
    "                                number     = '-'\n",
    "                                tense      = '-'\n",
    "                                mood       = '-'\n",
    "                                voice      = '-'\n",
    "                                gender     = '-'\n",
    "                                case       = '-'\n",
    "                                degree     = '-'\n",
    "                                strength   = '-'\n",
    "                                inflection = '-'\n",
    "                            elif postag == 'm-p---na':\n",
    "                                postag     = 'm-p---na-'\n",
    "                                pos        = postag[0]\n",
    "                                person     = postag[1]\n",
    "                                number     = postag[2]\n",
    "                                tense      = postag[3]\n",
    "                                mood       = postag[4]\n",
    "                                voice      = postag[5]\n",
    "                                gender     = postag[6]\n",
    "                                case       = postag[7]\n",
    "                                degree     = postag[8]\n",
    "                                strength   = '-'\n",
    "                                inflection = '-'\n",
    "                            elif postag == 'v2pasm':\n",
    "                                postag     = 'v2pasm---'\n",
    "                                pos        = postag[0]\n",
    "                                person     = postag[1]\n",
    "                                number     = postag[2]\n",
    "                                tense      = postag[3]\n",
    "                                mood       = postag[4]\n",
    "                                voice      = postag[5]\n",
    "                                gender     = postag[6]\n",
    "                                case       = postag[7]\n",
    "                                degree     = postag[8]\n",
    "                                strength   = '-'\n",
    "                                inflection = '-'\n",
    "                            else:\n",
    "                                #print(postag)\n",
    "                                pos        = postag[0]\n",
    "                                person     = postag[1]\n",
    "                                number     = postag[2]\n",
    "                                tense      = postag[3]\n",
    "                                mood       = postag[4]\n",
    "                                voice      = postag[5]\n",
    "                                gender     = postag[6]\n",
    "                                #if(not postag[7] in cases):\n",
    "                                #    print('postag',postag)\n",
    "                                case       = postag[7]\n",
    "                                degree     = postag[8]\n",
    "                                strength   = '-'\n",
    "                                inflection = '-'\n",
    "                        else:\n",
    "                            postag     = '-'\n",
    "                            pos        = '-'\n",
    "                            person     = '-'\n",
    "                            number     = '-'\n",
    "                            tense      = '-'\n",
    "                            mood       = '-'\n",
    "                            voice      = '-'\n",
    "                            gender     = '-'\n",
    "                            case       = '-'\n",
    "                            degree     = '-'\n",
    "                            strength   = '-'\n",
    "                            inflection = '-'\n",
    "\n",
    "                        wordList = []\n",
    "                        wordList.append(authinfo[authTLG].split(',')[0])\n",
    "                        wordList.append(authTLG)\n",
    "                        wordList.append(workTLG)\n",
    "                        if not authinfo[authTLG] == '':\n",
    "                            wordList.append(authinfo[authTLG].split(',')[1])\n",
    "                            wordList.append(authinfo[authTLG].split(',')[2])\n",
    "                            wordList.append(authinfo[authTLG].split(',')[3])\n",
    "                            wordList.append(authinfo[authTLG].split(',')[4])\n",
    "                            wordList.append(authinfo[authTLG].split(',')[5])\n",
    "                        else:\n",
    "                            wordList.append('-')\n",
    "                            wordList.append('-')\n",
    "                            wordList.append('-')\n",
    "                            wordList.append('-')\n",
    "                            wordList.append('-')\n",
    "                        wordList.append(sentID)\n",
    "                        wordList.append(docID)\n",
    "                        wordList.append(subdoc)\n",
    "                        wordList.append(author)\n",
    "                        wordList.append(wordID)\n",
    "                        wordList.append(head)\n",
    "                        wordList.append(form)\n",
    "                        wordList.append(lemma)\n",
    "                        wordList.append(relation)\n",
    "                        wordList.append(ref)\n",
    "                        wordList.append(presentation_after)\n",
    "                        wordList.append(insertionID)\n",
    "                        wordList.append(artificial)\n",
    "                        wordList.append(gloss)\n",
    "                        wordList.append(sem)\n",
    "                        wordList.append(slashList)\n",
    "                        wordList.append(postag)\n",
    "                        wordList.append(pos)\n",
    "                        wordList.append(person)\n",
    "                        wordList.append(number)\n",
    "                        wordList.append(tense)\n",
    "                        wordList.append(mood)\n",
    "                        wordList.append(voice)\n",
    "                        wordList.append(gender)\n",
    "                        wordList.append(case)\n",
    "                        wordList.append(degree)\n",
    "                        wordList.append(strength)\n",
    "                        wordList.append(inflection)\n",
    "                        returnList.append(wordList)\n",
    "\n",
    "    return returnList"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0003.tlg001.perseus-grc1.1.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0011.tlg001.perseus-grc2.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0011.tlg002.perseus-grc2.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0011.tlg003.perseus-grc1.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0011.tlg004.perseus-grc1.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0011.tlg005.perseus-grc2.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0012.tlg001.perseus-grc1.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0012.tlg002.perseus-grc1.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0013.tlg002.perseus-grc1.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0020.tlg001.perseus-grc1.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0020.tlg002.perseus-grc1.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0020.tlg003.perseus-grc1.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0085.tlg001.perseus-grc2.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0085.tlg002.perseus-grc2.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0085.tlg003.perseus-grc2.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0085.tlg004.perseus-grc2.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0085.tlg005.perseus-grc1.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0085.tlg006.perseus-grc2.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0085.tlg007.perseus-grc1.tb.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/achar.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/aeneas.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/aesop1.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/batracho.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/charb1.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/chilia-sentences.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/chion.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/crit.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/epictetus.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/epicurus1.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/euripides_medea.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/example-sentences.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/external_examplesentences.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/ez.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/genesis1.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/genesis2.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/genesis3.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/heron.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/iso.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/julian.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/longus.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/lucian_lis.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/lucian_prometheus.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/lucian_symposion.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/lysias_or24.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/menander_dyskolos.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/mimn.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/paean.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/papyri.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/phlegon.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/procopius.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/pseudo-lucian_themule.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/pseudoplato_cleitophon.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/sappho.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/semonides.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/sextus.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/theoc.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/theophr.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/thesmo.xml\n",
      "/Users/bellahwang/Documents/GitHub/pedalion-trees/public/xml/xenmem.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/aeschines-1-1-50-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/aeschines-1-101-150-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/aeschines-1-151-196-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/aeschines-1-51-100-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/antiphon-1-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/antiphon-2-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/antiphon-5-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/antiphon-6-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/appian-bc-1-0-1-4-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/appian-bc-1-11-14-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/appian-bc-1-5-7-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/appian-bc-1-8-10-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/aristotle-politics-book-1-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/aristotle-politics-book-2-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen12-1-9-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen12-10-19-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen12-20-29-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen12-20-29-jan-15.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen12-30-39-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen12-30-39-jan-15.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen12-40-49-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen12-40-49-jan-15.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen12-50-59-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen12-50-59-jan-15.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen12-60-69-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen12-60-69-jan-15.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen12-70-81-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen12-70-81-jan-15.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-1-9-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-1-9-jan-15.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-10-19-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-10-19-jan-15.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-20-29-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-20-29-jan-15.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-30-39-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-30-39-jan-15.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-40-49-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-40-49-jan-15.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-50-59-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-50-59-jan-15.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-60-69-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-60-69-jan-15.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-70-79-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-70-79-jan-15.xml\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-80-89-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-80-89-jan-15.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-90-95-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-90-95-jan-15.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/dem-59-neaira-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-1-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-18-1-50-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-18-101-150-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-18-151-200-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-18-201-275-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-18-276-324-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-18-51-100-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-4-phil1-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-46-tree.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-47-tree.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-49-tree.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-50-tree.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-52-tree.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-53-tree.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/diodsic-11-1-20-bu4.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/diodsic-11-81-92-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/diodsic11-21-40-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/diodsic11-41-60-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/diodsic11-61-80-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/dion-hal-1-1-15-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/dion-hal-1-16-30-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/dion-hal-1-31-45-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/dion-hal-1-46-60-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/dion-hal-1-61-75-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/dion-hal-1-76-90-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/hdt-1-1-19-bu3-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/hdt-1-100-119-bu3-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/hdt-1-120-149-bu2-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/hdt-1-150-169-bu3-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/hdt-1-170-189-bu2-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/hdt-1-190-216-bu2-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/hdt-1-20-39-bu2-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/hdt-1-40-59-bu2-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/hdt-1-60-79-bu2-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/hdt-1-80-99-bu5-2019.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/josephus-bj-1-1-2-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/josephus-bj-1-11-15-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/josephus-bj-1-16-20-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/josephus-bj-1-21-25-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/josephus-bj-1-3-5-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/josephus-bj-1-6-10-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/lysias-1-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/lysias-12-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/lysias-13-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/lysias-14-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/lysias-15.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/lysias-19-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/lysias-23-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/plato-apology.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/plut-alcib-1-17-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/plut-alcib-18-39-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/plut-fortuna-romanorum-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/plutarch-alex-fort-aut-virt-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/plutarch-lycurgus-1-15-bu4.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/plutarch-lycurgus-16-31-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-10-1-10-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-10-11-20-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-10-21-35-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-10-36-49-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-2-1-10-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-2-11-20-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-2-21-30-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-2-31-40-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-2-41-50-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-2-51-60-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-2-61-71-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-21-1-10-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-21-11-20-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-21-21-30-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-21-31-47-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-6-16-30-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-6-2-15-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-6-31-45-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-6-46-58-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-9-1-20-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-9-21-33-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-9-34-45-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius1-1-9-2017.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius1-10-19-2017.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius1-20-29-2017.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius1-30-39-2017.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius1-40-49-2017.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius1-50-59-2017.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius1-60-69-2017.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius1-70-79-2017.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius1-80-88-2017.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/ps-xen-ath-pol-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/thuc-1-1-20-bu5.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/thuc-1-101-120-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/thuc-1-121-146-bu3.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/thuc-1-21-40-bu4.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/thuc-1-41-60-bu3.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/thuc-1-61-80-bu3.xml\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/thuc-1-81-100-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/thuc-3-1-20-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/thuc-3-21-40-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-1-1-2-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-1-3-4-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-1-5-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-1-6-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-7-1-3-tree.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-7-4-5-tree.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-8-1-8-4-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-8-5-7-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-8-8-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-hell-1-1-4-bu2.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-hell-1-5-7-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-hell-2-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-hell-3-bu1.xml\n",
      "/Users/bellahwang/Documents/GitHub/proiel-treebank/chron.xml\n",
      "/Users/bellahwang/Documents/GitHub/proiel-treebank/greek-nt.xml\n",
      "/Users/bellahwang/Documents/GitHub/proiel-treebank/hdt.xml\n"
     ]
    }
   ],
   "source": [
    "allDFList = []\n",
    "for i in searchfiles:\n",
    "    print(i)\n",
    "    data = findAll(i)\n",
    "    df = pd.DataFrame(data, columns = ['Author', 'AuthorTLG', 'WorkTLG', 'Timeline', 'StartDate', 'EndDate', \\\n",
    "        'Poetry/Prose', 'Genre', 'sentID', 'docID', 'subdoc', 'AuthorName', 'wordID', \\\n",
    "        'head', 'form', 'lemma', 'relation', 'ref', 'presentation_after', 'insertionID', 'artificial', \\\n",
    "        'gloss', 'sem', 'slash', 'postag', 'pos', 'person', 'number', 'tense', 'mood', 'voice', 'gender', 'case', \\\n",
    "        'degree', 'strength', 'inflection'])\n",
    "    allDFList.append(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Author</th>\n",
       "      <th>AuthorTLG</th>\n",
       "      <th>WorkTLG</th>\n",
       "      <th>Timeline</th>\n",
       "      <th>StartDate</th>\n",
       "      <th>EndDate</th>\n",
       "      <th>Poetry/Prose</th>\n",
       "      <th>Genre</th>\n",
       "      <th>sentID</th>\n",
       "      <th>docID</th>\n",
       "      <th>subdoc</th>\n",
       "      <th>AuthorName</th>\n",
       "      <th>wordID</th>\n",
       "      <th>head</th>\n",
       "      <th>form</th>\n",
       "      <th>lemma</th>\n",
       "      <th>relation</th>\n",
       "      <th>ref</th>\n",
       "      <th>presentation_after</th>\n",
       "      <th>insertionID</th>\n",
       "      <th>artificial</th>\n",
       "      <th>gloss</th>\n",
       "      <th>sem</th>\n",
       "      <th>slash</th>\n",
       "      <th>postag</th>\n",
       "      <th>pos</th>\n",
       "      <th>person</th>\n",
       "      <th>number</th>\n",
       "      <th>tense</th>\n",
       "      <th>mood</th>\n",
       "      <th>voice</th>\n",
       "      <th>gender</th>\n",
       "      <th>case</th>\n",
       "      <th>degree</th>\n",
       "      <th>strength</th>\n",
       "      <th>inflection</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Thucydides</td>\n",
       "      <td>tlg0003</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-460c</td>\n",
       "      <td>-400c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>1</td>\n",
       "      <td>urn:cts:greekLit:tlg0003.tlg001.perseus-grc1</td>\n",
       "      <td>1.1.1</td>\n",
       "      <td>-</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Θουκυδίδης</td>\n",
       "      <td>Θουκυδίδης</td>\n",
       "      <td>SBJ</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>n-s---mn-</td>\n",
       "      <td>n</td>\n",
       "      <td>-</td>\n",
       "      <td>s</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>m</td>\n",
       "      <td>n</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Thucydides</td>\n",
       "      <td>tlg0003</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-460c</td>\n",
       "      <td>-400c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>1</td>\n",
       "      <td>urn:cts:greekLit:tlg0003.tlg001.perseus-grc1</td>\n",
       "      <td>1.1.1</td>\n",
       "      <td>-</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>Ἀθηναῖος</td>\n",
       "      <td>Ἀθηναῖος</td>\n",
       "      <td>ATR</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>n-s---mn-</td>\n",
       "      <td>n</td>\n",
       "      <td>-</td>\n",
       "      <td>s</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>m</td>\n",
       "      <td>n</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Thucydides</td>\n",
       "      <td>tlg0003</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-460c</td>\n",
       "      <td>-400c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>1</td>\n",
       "      <td>urn:cts:greekLit:tlg0003.tlg001.perseus-grc1</td>\n",
       "      <td>1.1.1</td>\n",
       "      <td>-</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>ξυνέγραψε</td>\n",
       "      <td>συγγράφω</td>\n",
       "      <td>PRED</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>v3saia---</td>\n",
       "      <td>v</td>\n",
       "      <td>3</td>\n",
       "      <td>s</td>\n",
       "      <td>a</td>\n",
       "      <td>i</td>\n",
       "      <td>a</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Thucydides</td>\n",
       "      <td>tlg0003</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-460c</td>\n",
       "      <td>-400c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>1</td>\n",
       "      <td>urn:cts:greekLit:tlg0003.tlg001.perseus-grc1</td>\n",
       "      <td>1.1.1</td>\n",
       "      <td>-</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>τὸν</td>\n",
       "      <td>ὁ</td>\n",
       "      <td>ATR</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>l-s---ma-</td>\n",
       "      <td>l</td>\n",
       "      <td>-</td>\n",
       "      <td>s</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>m</td>\n",
       "      <td>a</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Thucydides</td>\n",
       "      <td>tlg0003</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-460c</td>\n",
       "      <td>-400c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>1</td>\n",
       "      <td>urn:cts:greekLit:tlg0003.tlg001.perseus-grc1</td>\n",
       "      <td>1.1.1</td>\n",
       "      <td>-</td>\n",
       "      <td>5</td>\n",
       "      <td>10</td>\n",
       "      <td>πόλεμον</td>\n",
       "      <td>πόλεμος</td>\n",
       "      <td>OBJ_AP</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>n-s---ma-</td>\n",
       "      <td>n</td>\n",
       "      <td>-</td>\n",
       "      <td>s</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>m</td>\n",
       "      <td>a</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1495803</th>\n",
       "      <td>Herodotus</td>\n",
       "      <td>tlg0016</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-484c</td>\n",
       "      <td>-425c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>225174</td>\n",
       "      <td>-</td>\n",
       "      <td>8.7.2</td>\n",
       "      <td>-</td>\n",
       "      <td>1022394</td>\n",
       "      <td>1022397</td>\n",
       "      <td>ἐν</td>\n",
       "      <td>ἐν</td>\n",
       "      <td>adv</td>\n",
       "      <td>-</td>\n",
       "      <td></td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>---------n</td>\n",
       "      <td>R-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1495804</th>\n",
       "      <td>Herodotus</td>\n",
       "      <td>tlg0016</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-484c</td>\n",
       "      <td>-425c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>225174</td>\n",
       "      <td>-</td>\n",
       "      <td>8.7.2</td>\n",
       "      <td>-</td>\n",
       "      <td>1022395</td>\n",
       "      <td>1022396</td>\n",
       "      <td>τῇσι</td>\n",
       "      <td>ὁ</td>\n",
       "      <td>aux</td>\n",
       "      <td>-</td>\n",
       "      <td></td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-p---fd--i</td>\n",
       "      <td>S-</td>\n",
       "      <td>-</td>\n",
       "      <td>p</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>f</td>\n",
       "      <td>d</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>i</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1495805</th>\n",
       "      <td>Herodotus</td>\n",
       "      <td>tlg0016</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-484c</td>\n",
       "      <td>-425c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>225174</td>\n",
       "      <td>-</td>\n",
       "      <td>8.7.2</td>\n",
       "      <td>-</td>\n",
       "      <td>1022396</td>\n",
       "      <td>1022394</td>\n",
       "      <td>Ἀφέτῃσι</td>\n",
       "      <td>Ἀφέται</td>\n",
       "      <td>obl</td>\n",
       "      <td>-</td>\n",
       "      <td></td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-p---fd--i</td>\n",
       "      <td>Ne</td>\n",
       "      <td>-</td>\n",
       "      <td>p</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>f</td>\n",
       "      <td>d</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>i</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1495806</th>\n",
       "      <td>Herodotus</td>\n",
       "      <td>tlg0016</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-484c</td>\n",
       "      <td>-425c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>225174</td>\n",
       "      <td>-</td>\n",
       "      <td>8.7.2</td>\n",
       "      <td>-</td>\n",
       "      <td>1022397</td>\n",
       "      <td>-</td>\n",
       "      <td>ἐποιεῦντο</td>\n",
       "      <td>ποιέω</td>\n",
       "      <td>pred</td>\n",
       "      <td>-</td>\n",
       "      <td></td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>3piim----i</td>\n",
       "      <td>V-</td>\n",
       "      <td>3</td>\n",
       "      <td>p</td>\n",
       "      <td>i</td>\n",
       "      <td>i</td>\n",
       "      <td>m</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>i</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1495807</th>\n",
       "      <td>Herodotus</td>\n",
       "      <td>tlg0016</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-484c</td>\n",
       "      <td>-425c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>225174</td>\n",
       "      <td>-</td>\n",
       "      <td>8.7.2</td>\n",
       "      <td>-</td>\n",
       "      <td>1022398</td>\n",
       "      <td>1022397</td>\n",
       "      <td>ἀριθμόν</td>\n",
       "      <td>ἀριθμός</td>\n",
       "      <td>obj</td>\n",
       "      <td>-</td>\n",
       "      <td>.</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-s---ma--i</td>\n",
       "      <td>Nb</td>\n",
       "      <td>-</td>\n",
       "      <td>s</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>m</td>\n",
       "      <td>a</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>i</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1495808 rows × 36 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             Author AuthorTLG WorkTLG Timeline StartDate EndDate Poetry/Prose  \\\n",
       "0        Thucydides   tlg0003  tlg001       -5     -460c   -400c        prose   \n",
       "1        Thucydides   tlg0003  tlg001       -5     -460c   -400c        prose   \n",
       "2        Thucydides   tlg0003  tlg001       -5     -460c   -400c        prose   \n",
       "3        Thucydides   tlg0003  tlg001       -5     -460c   -400c        prose   \n",
       "4        Thucydides   tlg0003  tlg001       -5     -460c   -400c        prose   \n",
       "...             ...       ...     ...      ...       ...     ...          ...   \n",
       "1495803   Herodotus   tlg0016  tlg001       -5     -484c   -425c        prose   \n",
       "1495804   Herodotus   tlg0016  tlg001       -5     -484c   -425c        prose   \n",
       "1495805   Herodotus   tlg0016  tlg001       -5     -484c   -425c        prose   \n",
       "1495806   Herodotus   tlg0016  tlg001       -5     -484c   -425c        prose   \n",
       "1495807   Herodotus   tlg0016  tlg001       -5     -484c   -425c        prose   \n",
       "\n",
       "           Genre  sentID                                         docID subdoc  \\\n",
       "0        history       1  urn:cts:greekLit:tlg0003.tlg001.perseus-grc1  1.1.1   \n",
       "1        history       1  urn:cts:greekLit:tlg0003.tlg001.perseus-grc1  1.1.1   \n",
       "2        history       1  urn:cts:greekLit:tlg0003.tlg001.perseus-grc1  1.1.1   \n",
       "3        history       1  urn:cts:greekLit:tlg0003.tlg001.perseus-grc1  1.1.1   \n",
       "4        history       1  urn:cts:greekLit:tlg0003.tlg001.perseus-grc1  1.1.1   \n",
       "...          ...     ...                                           ...    ...   \n",
       "1495803  history  225174                                             -  8.7.2   \n",
       "1495804  history  225174                                             -  8.7.2   \n",
       "1495805  history  225174                                             -  8.7.2   \n",
       "1495806  history  225174                                             -  8.7.2   \n",
       "1495807  history  225174                                             -  8.7.2   \n",
       "\n",
       "        AuthorName   wordID     head        form       lemma relation ref  \\\n",
       "0                -        1        3  Θουκυδίδης  Θουκυδίδης      SBJ   -   \n",
       "1                -        2        1    Ἀθηναῖος    Ἀθηναῖος      ATR   -   \n",
       "2                -        3        0   ξυνέγραψε    συγγράφω     PRED   -   \n",
       "3                -        4        5         τὸν           ὁ      ATR   -   \n",
       "4                -        5       10     πόλεμον     πόλεμος   OBJ_AP   -   \n",
       "...            ...      ...      ...         ...         ...      ...  ..   \n",
       "1495803          -  1022394  1022397          ἐν          ἐν      adv   -   \n",
       "1495804          -  1022395  1022396        τῇσι           ὁ      aux   -   \n",
       "1495805          -  1022396  1022394     Ἀφέτῃσι      Ἀφέται      obl   -   \n",
       "1495806          -  1022397        -   ἐποιεῦντο       ποιέω     pred   -   \n",
       "1495807          -  1022398  1022397     ἀριθμόν     ἀριθμός      obj   -   \n",
       "\n",
       "        presentation_after insertionID artificial gloss sem slash      postag  \\\n",
       "0                        -           -          -     -   -     -   n-s---mn-   \n",
       "1                        -           -          -     -   -     -   n-s---mn-   \n",
       "2                        -           -          -     -   -     -   v3saia---   \n",
       "3                        -           -          -     -   -     -   l-s---ma-   \n",
       "4                        -           -          -     -   -     -   n-s---ma-   \n",
       "...                    ...         ...        ...   ...  ..   ...         ...   \n",
       "1495803                              -          -     -   -     -  ---------n   \n",
       "1495804                              -          -     -   -     -  -p---fd--i   \n",
       "1495805                              -          -     -   -     -  -p---fd--i   \n",
       "1495806                              -          -     -   -     -  3piim----i   \n",
       "1495807                 .            -          -     -   -     -  -s---ma--i   \n",
       "\n",
       "        pos person number tense mood voice gender case degree strength  \\\n",
       "0         n      -      s     -    -     -      m    n      -        -   \n",
       "1         n      -      s     -    -     -      m    n      -        -   \n",
       "2         v      3      s     a    i     a      -    -      -        -   \n",
       "3         l      -      s     -    -     -      m    a      -        -   \n",
       "4         n      -      s     -    -     -      m    a      -        -   \n",
       "...      ..    ...    ...   ...  ...   ...    ...  ...    ...      ...   \n",
       "1495803  R-      -      -     -    -     -      -    -      -        -   \n",
       "1495804  S-      -      p     -    -     -      f    d      -        -   \n",
       "1495805  Ne      -      p     -    -     -      f    d      -        -   \n",
       "1495806  V-      3      p     i    i     m      -    -      -        -   \n",
       "1495807  Nb      -      s     -    -     -      m    a      -        -   \n",
       "\n",
       "        inflection  \n",
       "0                -  \n",
       "1                -  \n",
       "2                -  \n",
       "3                -  \n",
       "4                -  \n",
       "...            ...  \n",
       "1495803          n  \n",
       "1495804          i  \n",
       "1495805          i  \n",
       "1495806          i  \n",
       "1495807          i  \n",
       "\n",
       "[1495808 rows x 36 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# lift the column cap so wide frames render every column instead of a '...' placeholder\n",
    "# (row truncation is unaffected by this option)\n",
    "pd.options.display.max_columns = None\n",
    "\n",
    "# stack the frames collected in allDFList into one frame, renumbering rows from 0\n",
    "master_df = pd.concat(allDFList, ignore_index=True)\n",
    "master_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Author</th>\n",
       "      <th>AuthorTLG</th>\n",
       "      <th>WorkTLG</th>\n",
       "      <th>Timeline</th>\n",
       "      <th>StartDate</th>\n",
       "      <th>EndDate</th>\n",
       "      <th>Poetry/Prose</th>\n",
       "      <th>Genre</th>\n",
       "      <th>sentID</th>\n",
       "      <th>docID</th>\n",
       "      <th>subdoc</th>\n",
       "      <th>AuthorName</th>\n",
       "      <th>wordID</th>\n",
       "      <th>head</th>\n",
       "      <th>form</th>\n",
       "      <th>lemma</th>\n",
       "      <th>relation</th>\n",
       "      <th>ref</th>\n",
       "      <th>presentation_after</th>\n",
       "      <th>insertionID</th>\n",
       "      <th>artificial</th>\n",
       "      <th>gloss</th>\n",
       "      <th>sem</th>\n",
       "      <th>slash</th>\n",
       "      <th>postag</th>\n",
       "      <th>pos</th>\n",
       "      <th>person</th>\n",
       "      <th>number</th>\n",
       "      <th>tense</th>\n",
       "      <th>mood</th>\n",
       "      <th>voice</th>\n",
       "      <th>gender</th>\n",
       "      <th>case</th>\n",
       "      <th>degree</th>\n",
       "      <th>strength</th>\n",
       "      <th>inflection</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Thucydides</td>\n",
       "      <td>tlg0003</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-460c</td>\n",
       "      <td>-400c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>1</td>\n",
       "      <td>urn:cts:greekLit:tlg0003.tlg001.perseus-grc1</td>\n",
       "      <td>1.1.1</td>\n",
       "      <td>-</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Θουκυδίδης</td>\n",
       "      <td>Θουκυδίδης</td>\n",
       "      <td>SBJ</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>n-s---mn-</td>\n",
       "      <td>n</td>\n",
       "      <td>-</td>\n",
       "      <td>s</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>m</td>\n",
       "      <td>n</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Thucydides</td>\n",
       "      <td>tlg0003</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-460c</td>\n",
       "      <td>-400c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>1</td>\n",
       "      <td>urn:cts:greekLit:tlg0003.tlg001.perseus-grc1</td>\n",
       "      <td>1.1.1</td>\n",
       "      <td>-</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>Ἀθηναῖος</td>\n",
       "      <td>Ἀθηναῖος</td>\n",
       "      <td>ATR</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>n-s---mn-</td>\n",
       "      <td>n</td>\n",
       "      <td>-</td>\n",
       "      <td>s</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>m</td>\n",
       "      <td>n</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Thucydides</td>\n",
       "      <td>tlg0003</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-460c</td>\n",
       "      <td>-400c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>1</td>\n",
       "      <td>urn:cts:greekLit:tlg0003.tlg001.perseus-grc1</td>\n",
       "      <td>1.1.1</td>\n",
       "      <td>-</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>ξυνέγραψε</td>\n",
       "      <td>συγγράφω</td>\n",
       "      <td>PRED</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>v3saia---</td>\n",
       "      <td>v</td>\n",
       "      <td>3</td>\n",
       "      <td>s</td>\n",
       "      <td>a</td>\n",
       "      <td>i</td>\n",
       "      <td>a</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Thucydides</td>\n",
       "      <td>tlg0003</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-460c</td>\n",
       "      <td>-400c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>1</td>\n",
       "      <td>urn:cts:greekLit:tlg0003.tlg001.perseus-grc1</td>\n",
       "      <td>1.1.1</td>\n",
       "      <td>-</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>τὸν</td>\n",
       "      <td>ὁ</td>\n",
       "      <td>ATR</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>l-s---ma-</td>\n",
       "      <td>l</td>\n",
       "      <td>-</td>\n",
       "      <td>s</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>m</td>\n",
       "      <td>a</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Thucydides</td>\n",
       "      <td>tlg0003</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-460c</td>\n",
       "      <td>-400c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>1</td>\n",
       "      <td>urn:cts:greekLit:tlg0003.tlg001.perseus-grc1</td>\n",
       "      <td>1.1.1</td>\n",
       "      <td>-</td>\n",
       "      <td>5</td>\n",
       "      <td>10</td>\n",
       "      <td>πόλεμον</td>\n",
       "      <td>πόλεμος</td>\n",
       "      <td>OBJ_AP</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>n-s---ma-</td>\n",
       "      <td>n</td>\n",
       "      <td>-</td>\n",
       "      <td>s</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>m</td>\n",
       "      <td>a</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1495803</th>\n",
       "      <td>Herodotus</td>\n",
       "      <td>tlg0016</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-484c</td>\n",
       "      <td>-425c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>225174</td>\n",
       "      <td>-</td>\n",
       "      <td>8.7.2</td>\n",
       "      <td>-</td>\n",
       "      <td>1022394</td>\n",
       "      <td>1022397</td>\n",
       "      <td>ἐν</td>\n",
       "      <td>ἐν</td>\n",
       "      <td>adv</td>\n",
       "      <td>-</td>\n",
       "      <td></td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>---------n</td>\n",
       "      <td>R-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1495804</th>\n",
       "      <td>Herodotus</td>\n",
       "      <td>tlg0016</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-484c</td>\n",
       "      <td>-425c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>225174</td>\n",
       "      <td>-</td>\n",
       "      <td>8.7.2</td>\n",
       "      <td>-</td>\n",
       "      <td>1022395</td>\n",
       "      <td>1022396</td>\n",
       "      <td>τῇσι</td>\n",
       "      <td>ὁ</td>\n",
       "      <td>aux</td>\n",
       "      <td>-</td>\n",
       "      <td></td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-p---fd--i</td>\n",
       "      <td>S-</td>\n",
       "      <td>-</td>\n",
       "      <td>p</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>f</td>\n",
       "      <td>d</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>i</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1495805</th>\n",
       "      <td>Herodotus</td>\n",
       "      <td>tlg0016</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-484c</td>\n",
       "      <td>-425c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>225174</td>\n",
       "      <td>-</td>\n",
       "      <td>8.7.2</td>\n",
       "      <td>-</td>\n",
       "      <td>1022396</td>\n",
       "      <td>1022394</td>\n",
       "      <td>Ἀφέτῃσι</td>\n",
       "      <td>Ἀφέται</td>\n",
       "      <td>obl</td>\n",
       "      <td>-</td>\n",
       "      <td></td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-p---fd--i</td>\n",
       "      <td>Ne</td>\n",
       "      <td>-</td>\n",
       "      <td>p</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>f</td>\n",
       "      <td>d</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>i</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1495806</th>\n",
       "      <td>Herodotus</td>\n",
       "      <td>tlg0016</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-484c</td>\n",
       "      <td>-425c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>225174</td>\n",
       "      <td>-</td>\n",
       "      <td>8.7.2</td>\n",
       "      <td>-</td>\n",
       "      <td>1022397</td>\n",
       "      <td>-</td>\n",
       "      <td>ἐποιεῦντο</td>\n",
       "      <td>ποιέω</td>\n",
       "      <td>pred</td>\n",
       "      <td>-</td>\n",
       "      <td></td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>3piim----i</td>\n",
       "      <td>V-</td>\n",
       "      <td>3</td>\n",
       "      <td>p</td>\n",
       "      <td>i</td>\n",
       "      <td>i</td>\n",
       "      <td>m</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>i</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1495807</th>\n",
       "      <td>Herodotus</td>\n",
       "      <td>tlg0016</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-484c</td>\n",
       "      <td>-425c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>225174</td>\n",
       "      <td>-</td>\n",
       "      <td>8.7.2</td>\n",
       "      <td>-</td>\n",
       "      <td>1022398</td>\n",
       "      <td>1022397</td>\n",
       "      <td>ἀριθμόν</td>\n",
       "      <td>ἀριθμός</td>\n",
       "      <td>obj</td>\n",
       "      <td>-</td>\n",
       "      <td>.</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-s---ma--i</td>\n",
       "      <td>Nb</td>\n",
       "      <td>-</td>\n",
       "      <td>s</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>m</td>\n",
       "      <td>a</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>i</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1489845 rows × 36 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             Author AuthorTLG WorkTLG Timeline StartDate EndDate Poetry/Prose  \\\n",
       "0        Thucydides   tlg0003  tlg001       -5     -460c   -400c        prose   \n",
       "1        Thucydides   tlg0003  tlg001       -5     -460c   -400c        prose   \n",
       "2        Thucydides   tlg0003  tlg001       -5     -460c   -400c        prose   \n",
       "3        Thucydides   tlg0003  tlg001       -5     -460c   -400c        prose   \n",
       "4        Thucydides   tlg0003  tlg001       -5     -460c   -400c        prose   \n",
       "...             ...       ...     ...      ...       ...     ...          ...   \n",
       "1495803   Herodotus   tlg0016  tlg001       -5     -484c   -425c        prose   \n",
       "1495804   Herodotus   tlg0016  tlg001       -5     -484c   -425c        prose   \n",
       "1495805   Herodotus   tlg0016  tlg001       -5     -484c   -425c        prose   \n",
       "1495806   Herodotus   tlg0016  tlg001       -5     -484c   -425c        prose   \n",
       "1495807   Herodotus   tlg0016  tlg001       -5     -484c   -425c        prose   \n",
       "\n",
       "           Genre  sentID                                         docID subdoc  \\\n",
       "0        history       1  urn:cts:greekLit:tlg0003.tlg001.perseus-grc1  1.1.1   \n",
       "1        history       1  urn:cts:greekLit:tlg0003.tlg001.perseus-grc1  1.1.1   \n",
       "2        history       1  urn:cts:greekLit:tlg0003.tlg001.perseus-grc1  1.1.1   \n",
       "3        history       1  urn:cts:greekLit:tlg0003.tlg001.perseus-grc1  1.1.1   \n",
       "4        history       1  urn:cts:greekLit:tlg0003.tlg001.perseus-grc1  1.1.1   \n",
       "...          ...     ...                                           ...    ...   \n",
       "1495803  history  225174                                             -  8.7.2   \n",
       "1495804  history  225174                                             -  8.7.2   \n",
       "1495805  history  225174                                             -  8.7.2   \n",
       "1495806  history  225174                                             -  8.7.2   \n",
       "1495807  history  225174                                             -  8.7.2   \n",
       "\n",
       "        AuthorName   wordID     head        form       lemma relation ref  \\\n",
       "0                -        1        3  Θουκυδίδης  Θουκυδίδης      SBJ   -   \n",
       "1                -        2        1    Ἀθηναῖος    Ἀθηναῖος      ATR   -   \n",
       "2                -        3        0   ξυνέγραψε    συγγράφω     PRED   -   \n",
       "3                -        4        5         τὸν           ὁ      ATR   -   \n",
       "4                -        5       10     πόλεμον     πόλεμος   OBJ_AP   -   \n",
       "...            ...      ...      ...         ...         ...      ...  ..   \n",
       "1495803          -  1022394  1022397          ἐν          ἐν      adv   -   \n",
       "1495804          -  1022395  1022396        τῇσι           ὁ      aux   -   \n",
       "1495805          -  1022396  1022394     Ἀφέτῃσι      Ἀφέται      obl   -   \n",
       "1495806          -  1022397        -   ἐποιεῦντο       ποιέω     pred   -   \n",
       "1495807          -  1022398  1022397     ἀριθμόν     ἀριθμός      obj   -   \n",
       "\n",
       "        presentation_after insertionID artificial gloss sem slash      postag  \\\n",
       "0                        -           -          -     -   -     -   n-s---mn-   \n",
       "1                        -           -          -     -   -     -   n-s---mn-   \n",
       "2                        -           -          -     -   -     -   v3saia---   \n",
       "3                        -           -          -     -   -     -   l-s---ma-   \n",
       "4                        -           -          -     -   -     -   n-s---ma-   \n",
       "...                    ...         ...        ...   ...  ..   ...         ...   \n",
       "1495803                              -          -     -   -     -  ---------n   \n",
       "1495804                              -          -     -   -     -  -p---fd--i   \n",
       "1495805                              -          -     -   -     -  -p---fd--i   \n",
       "1495806                              -          -     -   -     -  3piim----i   \n",
       "1495807                 .            -          -     -   -     -  -s---ma--i   \n",
       "\n",
       "        pos person number tense mood voice gender case degree strength  \\\n",
       "0         n      -      s     -    -     -      m    n      -        -   \n",
       "1         n      -      s     -    -     -      m    n      -        -   \n",
       "2         v      3      s     a    i     a      -    -      -        -   \n",
       "3         l      -      s     -    -     -      m    a      -        -   \n",
       "4         n      -      s     -    -     -      m    a      -        -   \n",
       "...      ..    ...    ...   ...  ...   ...    ...  ...    ...      ...   \n",
       "1495803  R-      -      -     -    -     -      -    -      -        -   \n",
       "1495804  S-      -      p     -    -     -      f    d      -        -   \n",
       "1495805  Ne      -      p     -    -     -      f    d      -        -   \n",
       "1495806  V-      3      p     i    i     m      -    -      -        -   \n",
       "1495807  Nb      -      s     -    -     -      m    a      -        -   \n",
       "\n",
       "        inflection  \n",
       "0                -  \n",
       "1                -  \n",
       "2                -  \n",
       "3                -  \n",
       "4                -  \n",
       "...            ...  \n",
       "1495803          n  \n",
       "1495804          i  \n",
       "1495805          i  \n",
       "1495806          i  \n",
       "1495807          i  \n",
       "\n",
       "[1489845 rows x 36 columns]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# replaces inconsistent null values with NaN\n",
    "# ignores Chilia, Pedalion, and papyri in terms of authors\n",
    "master_df['Author'].replace('', np.nan, inplace=True)\n",
    "master_df.replace('-', np.nan, inplace=True)\n",
    "master_df.replace('_', np.nan, inplace=True)\n",
    "master_df.replace('?', np.nan, inplace=True)\n",
    "master_df.dropna(subset=['Author'], inplace=True)\n",
    "\n",
    "# added this to prevent visualizations from complaining about NaN values\n",
    "master_df.fillna('-', inplace=True)\n",
    "\n",
    "master_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Author</th>\n",
       "      <th>AuthorTLG</th>\n",
       "      <th>WorkTLG</th>\n",
       "      <th>Timeline</th>\n",
       "      <th>StartDate</th>\n",
       "      <th>EndDate</th>\n",
       "      <th>Poetry/Prose</th>\n",
       "      <th>Genre</th>\n",
       "      <th>sentID</th>\n",
       "      <th>docID</th>\n",
       "      <th>subdoc</th>\n",
       "      <th>AuthorName</th>\n",
       "      <th>wordID</th>\n",
       "      <th>head</th>\n",
       "      <th>form</th>\n",
       "      <th>lemma</th>\n",
       "      <th>relation</th>\n",
       "      <th>ref</th>\n",
       "      <th>presentation_after</th>\n",
       "      <th>insertionID</th>\n",
       "      <th>artificial</th>\n",
       "      <th>gloss</th>\n",
       "      <th>sem</th>\n",
       "      <th>slash</th>\n",
       "      <th>postag</th>\n",
       "      <th>pos</th>\n",
       "      <th>person</th>\n",
       "      <th>number</th>\n",
       "      <th>tense</th>\n",
       "      <th>mood</th>\n",
       "      <th>voice</th>\n",
       "      <th>gender</th>\n",
       "      <th>case</th>\n",
       "      <th>degree</th>\n",
       "      <th>strength</th>\n",
       "      <th>inflection</th>\n",
       "      <th>authWordCount</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Thucydides</td>\n",
       "      <td>tlg0003</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-460c</td>\n",
       "      <td>-400c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>1</td>\n",
       "      <td>urn:cts:greekLit:tlg0003.tlg001.perseus-grc1</td>\n",
       "      <td>1.1.1</td>\n",
       "      <td>-</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Θουκυδίδης</td>\n",
       "      <td>Θουκυδίδης</td>\n",
       "      <td>SBJ</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>n-s---mn-</td>\n",
       "      <td>noun</td>\n",
       "      <td>-</td>\n",
       "      <td>singular</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>masculine</td>\n",
       "      <td>nominative</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>57795</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Thucydides</td>\n",
       "      <td>tlg0003</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-460c</td>\n",
       "      <td>-400c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>1</td>\n",
       "      <td>urn:cts:greekLit:tlg0003.tlg001.perseus-grc1</td>\n",
       "      <td>1.1.1</td>\n",
       "      <td>-</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>Ἀθηναῖος</td>\n",
       "      <td>Ἀθηναῖος</td>\n",
       "      <td>ATR</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>n-s---mn-</td>\n",
       "      <td>noun</td>\n",
       "      <td>-</td>\n",
       "      <td>singular</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>masculine</td>\n",
       "      <td>nominative</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>57795</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Thucydides</td>\n",
       "      <td>tlg0003</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-460c</td>\n",
       "      <td>-400c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>1</td>\n",
       "      <td>urn:cts:greekLit:tlg0003.tlg001.perseus-grc1</td>\n",
       "      <td>1.1.1</td>\n",
       "      <td>-</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>ξυνέγραψε</td>\n",
       "      <td>συγγράφω</td>\n",
       "      <td>PRED</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>v3saia---</td>\n",
       "      <td>verb</td>\n",
       "      <td>3rd</td>\n",
       "      <td>singular</td>\n",
       "      <td>aorist</td>\n",
       "      <td>indicative</td>\n",
       "      <td>active</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>57795</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Thucydides</td>\n",
       "      <td>tlg0003</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-460c</td>\n",
       "      <td>-400c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>1</td>\n",
       "      <td>urn:cts:greekLit:tlg0003.tlg001.perseus-grc1</td>\n",
       "      <td>1.1.1</td>\n",
       "      <td>-</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>τὸν</td>\n",
       "      <td>ὁ</td>\n",
       "      <td>ATR</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>l-s---ma-</td>\n",
       "      <td>article</td>\n",
       "      <td>-</td>\n",
       "      <td>singular</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>masculine</td>\n",
       "      <td>accusative</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>57795</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Thucydides</td>\n",
       "      <td>tlg0003</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-460c</td>\n",
       "      <td>-400c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>1</td>\n",
       "      <td>urn:cts:greekLit:tlg0003.tlg001.perseus-grc1</td>\n",
       "      <td>1.1.1</td>\n",
       "      <td>-</td>\n",
       "      <td>5</td>\n",
       "      <td>10</td>\n",
       "      <td>πόλεμον</td>\n",
       "      <td>πόλεμος</td>\n",
       "      <td>OBJ_AP</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>n-s---ma-</td>\n",
       "      <td>noun</td>\n",
       "      <td>-</td>\n",
       "      <td>singular</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>masculine</td>\n",
       "      <td>accusative</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>57795</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1495803</th>\n",
       "      <td>Herodotus</td>\n",
       "      <td>tlg0016</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-484c</td>\n",
       "      <td>-425c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>225174</td>\n",
       "      <td>-</td>\n",
       "      <td>8.7.2</td>\n",
       "      <td>-</td>\n",
       "      <td>1022394</td>\n",
       "      <td>1022397</td>\n",
       "      <td>ἐν</td>\n",
       "      <td>ἐν</td>\n",
       "      <td>adv</td>\n",
       "      <td>-</td>\n",
       "      <td></td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>---------n</td>\n",
       "      <td>preposition</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>non-inflecting</td>\n",
       "      <td>91864</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1495804</th>\n",
       "      <td>Herodotus</td>\n",
       "      <td>tlg0016</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-484c</td>\n",
       "      <td>-425c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>225174</td>\n",
       "      <td>-</td>\n",
       "      <td>8.7.2</td>\n",
       "      <td>-</td>\n",
       "      <td>1022395</td>\n",
       "      <td>1022396</td>\n",
       "      <td>τῇσι</td>\n",
       "      <td>ὁ</td>\n",
       "      <td>aux</td>\n",
       "      <td>-</td>\n",
       "      <td></td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-p---fd--i</td>\n",
       "      <td>article</td>\n",
       "      <td>-</td>\n",
       "      <td>plural</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>feminine</td>\n",
       "      <td>dative</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>inflecting</td>\n",
       "      <td>91864</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1495805</th>\n",
       "      <td>Herodotus</td>\n",
       "      <td>tlg0016</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-484c</td>\n",
       "      <td>-425c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>225174</td>\n",
       "      <td>-</td>\n",
       "      <td>8.7.2</td>\n",
       "      <td>-</td>\n",
       "      <td>1022396</td>\n",
       "      <td>1022394</td>\n",
       "      <td>Ἀφέτῃσι</td>\n",
       "      <td>Ἀφέται</td>\n",
       "      <td>obl</td>\n",
       "      <td>-</td>\n",
       "      <td></td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-p---fd--i</td>\n",
       "      <td>proper noun</td>\n",
       "      <td>-</td>\n",
       "      <td>plural</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>feminine</td>\n",
       "      <td>dative</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>inflecting</td>\n",
       "      <td>91864</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1495806</th>\n",
       "      <td>Herodotus</td>\n",
       "      <td>tlg0016</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-484c</td>\n",
       "      <td>-425c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>225174</td>\n",
       "      <td>-</td>\n",
       "      <td>8.7.2</td>\n",
       "      <td>-</td>\n",
       "      <td>1022397</td>\n",
       "      <td>-</td>\n",
       "      <td>ἐποιεῦντο</td>\n",
       "      <td>ποιέω</td>\n",
       "      <td>pred</td>\n",
       "      <td>-</td>\n",
       "      <td></td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>3piim----i</td>\n",
       "      <td>verb</td>\n",
       "      <td>3rd</td>\n",
       "      <td>plural</td>\n",
       "      <td>imperfect</td>\n",
       "      <td>indicative</td>\n",
       "      <td>middle</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>inflecting</td>\n",
       "      <td>91864</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1495807</th>\n",
       "      <td>Herodotus</td>\n",
       "      <td>tlg0016</td>\n",
       "      <td>tlg001</td>\n",
       "      <td>-5</td>\n",
       "      <td>-484c</td>\n",
       "      <td>-425c</td>\n",
       "      <td>prose</td>\n",
       "      <td>history</td>\n",
       "      <td>225174</td>\n",
       "      <td>-</td>\n",
       "      <td>8.7.2</td>\n",
       "      <td>-</td>\n",
       "      <td>1022398</td>\n",
       "      <td>1022397</td>\n",
       "      <td>ἀριθμόν</td>\n",
       "      <td>ἀριθμός</td>\n",
       "      <td>obj</td>\n",
       "      <td>-</td>\n",
       "      <td>.</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-s---ma--i</td>\n",
       "      <td>common noun</td>\n",
       "      <td>-</td>\n",
       "      <td>singular</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>masculine</td>\n",
       "      <td>accusative</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>inflecting</td>\n",
       "      <td>91864</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1489845 rows × 37 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             Author AuthorTLG WorkTLG Timeline StartDate EndDate Poetry/Prose  \\\n",
       "0        Thucydides   tlg0003  tlg001       -5     -460c   -400c        prose   \n",
       "1        Thucydides   tlg0003  tlg001       -5     -460c   -400c        prose   \n",
       "2        Thucydides   tlg0003  tlg001       -5     -460c   -400c        prose   \n",
       "3        Thucydides   tlg0003  tlg001       -5     -460c   -400c        prose   \n",
       "4        Thucydides   tlg0003  tlg001       -5     -460c   -400c        prose   \n",
       "...             ...       ...     ...      ...       ...     ...          ...   \n",
       "1495803   Herodotus   tlg0016  tlg001       -5     -484c   -425c        prose   \n",
       "1495804   Herodotus   tlg0016  tlg001       -5     -484c   -425c        prose   \n",
       "1495805   Herodotus   tlg0016  tlg001       -5     -484c   -425c        prose   \n",
       "1495806   Herodotus   tlg0016  tlg001       -5     -484c   -425c        prose   \n",
       "1495807   Herodotus   tlg0016  tlg001       -5     -484c   -425c        prose   \n",
       "\n",
       "           Genre  sentID                                         docID subdoc  \\\n",
       "0        history       1  urn:cts:greekLit:tlg0003.tlg001.perseus-grc1  1.1.1   \n",
       "1        history       1  urn:cts:greekLit:tlg0003.tlg001.perseus-grc1  1.1.1   \n",
       "2        history       1  urn:cts:greekLit:tlg0003.tlg001.perseus-grc1  1.1.1   \n",
       "3        history       1  urn:cts:greekLit:tlg0003.tlg001.perseus-grc1  1.1.1   \n",
       "4        history       1  urn:cts:greekLit:tlg0003.tlg001.perseus-grc1  1.1.1   \n",
       "...          ...     ...                                           ...    ...   \n",
       "1495803  history  225174                                             -  8.7.2   \n",
       "1495804  history  225174                                             -  8.7.2   \n",
       "1495805  history  225174                                             -  8.7.2   \n",
       "1495806  history  225174                                             -  8.7.2   \n",
       "1495807  history  225174                                             -  8.7.2   \n",
       "\n",
       "        AuthorName   wordID     head        form       lemma relation ref  \\\n",
       "0                -        1        3  Θουκυδίδης  Θουκυδίδης      SBJ   -   \n",
       "1                -        2        1    Ἀθηναῖος    Ἀθηναῖος      ATR   -   \n",
       "2                -        3        0   ξυνέγραψε    συγγράφω     PRED   -   \n",
       "3                -        4        5         τὸν           ὁ      ATR   -   \n",
       "4                -        5       10     πόλεμον     πόλεμος   OBJ_AP   -   \n",
       "...            ...      ...      ...         ...         ...      ...  ..   \n",
       "1495803          -  1022394  1022397          ἐν          ἐν      adv   -   \n",
       "1495804          -  1022395  1022396        τῇσι           ὁ      aux   -   \n",
       "1495805          -  1022396  1022394     Ἀφέτῃσι      Ἀφέται      obl   -   \n",
       "1495806          -  1022397        -   ἐποιεῦντο       ποιέω     pred   -   \n",
       "1495807          -  1022398  1022397     ἀριθμόν     ἀριθμός      obj   -   \n",
       "\n",
       "        presentation_after insertionID artificial gloss sem slash      postag  \\\n",
       "0                        -           -          -     -   -     -   n-s---mn-   \n",
       "1                        -           -          -     -   -     -   n-s---mn-   \n",
       "2                        -           -          -     -   -     -   v3saia---   \n",
       "3                        -           -          -     -   -     -   l-s---ma-   \n",
       "4                        -           -          -     -   -     -   n-s---ma-   \n",
       "...                    ...         ...        ...   ...  ..   ...         ...   \n",
       "1495803                              -          -     -   -     -  ---------n   \n",
       "1495804                              -          -     -   -     -  -p---fd--i   \n",
       "1495805                              -          -     -   -     -  -p---fd--i   \n",
       "1495806                              -          -     -   -     -  3piim----i   \n",
       "1495807                 .            -          -     -   -     -  -s---ma--i   \n",
       "\n",
       "                 pos person    number      tense        mood   voice  \\\n",
       "0               noun      -  singular          -           -       -   \n",
       "1               noun      -  singular          -           -       -   \n",
       "2               verb    3rd  singular     aorist  indicative  active   \n",
       "3            article      -  singular          -           -       -   \n",
       "4               noun      -  singular          -           -       -   \n",
       "...              ...    ...       ...        ...         ...     ...   \n",
       "1495803  preposition      -         -          -           -       -   \n",
       "1495804      article      -    plural          -           -       -   \n",
       "1495805  proper noun      -    plural          -           -       -   \n",
       "1495806         verb    3rd    plural  imperfect  indicative  middle   \n",
       "1495807  common noun      -  singular          -           -       -   \n",
       "\n",
       "            gender        case degree strength      inflection  authWordCount  \n",
       "0        masculine  nominative      -        -               -          57795  \n",
       "1        masculine  nominative      -        -               -          57795  \n",
       "2                -           -      -        -               -          57795  \n",
       "3        masculine  accusative      -        -               -          57795  \n",
       "4        masculine  accusative      -        -               -          57795  \n",
       "...            ...         ...    ...      ...             ...            ...  \n",
       "1495803          -           -      -        -  non-inflecting          91864  \n",
       "1495804   feminine      dative      -        -      inflecting          91864  \n",
       "1495805   feminine      dative      -        -      inflecting          91864  \n",
       "1495806          -           -      -        -      inflecting          91864  \n",
       "1495807  masculine  accusative      -        -      inflecting          91864  \n",
       "\n",
       "[1489845 rows x 37 columns]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# replaces single letter identifiers with readable labels\n",
    "master_df['pos'].replace(POSes, inplace=True)\n",
    "master_df['person'].replace(persons, inplace=True)\n",
    "master_df['number'].replace(numbers, inplace=True)\n",
    "master_df['tense'].replace(tenses, inplace=True)\n",
    "master_df['mood'].replace(moods, inplace=True)\n",
    "master_df['voice'].replace(voices, inplace=True)\n",
    "master_df['gender'].replace(genders, inplace=True)\n",
    "master_df['case'].replace(cases, inplace=True)\n",
    "master_df['degree'].replace(degrees, inplace=True)\n",
    "master_df['strength'].replace(strengths, inplace=True)\n",
    "master_df['inflection'].replace(inflections, inplace=True)\n",
    "\n",
    "# includes total word counts across authors\n",
    "master_df['authWordCount'] = master_df.groupby('Author')['Author'].transform('count')\n",
    "\n",
    "master_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "FILENAME = os.path.join(FILEPATH, 'allauthors.csv')\n",
    "master_df.to_csv(FILENAME, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
